knitr::opts_chunk$set(echo = TRUE)
library(infotheo)
library(corrplot)
library(factoextra)
library(NbClust)
library(cluster)
library(plotly)
library(irr)
library(anytime)
library(dplyr)
library(ggdendro)
library(tidyverse)
Read in the Data
df.mapping.raw <- read_csv('../raw_map.csv')
Rows: 2406 Columns: 78── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (72): Q1concept_behav, Q1concept_behav_elaboration, Q1concept_behav_confidence, Q2intel_manip_1_elaboration, Q2intel_manip_1_confi...
dbl (5): Q2intel_manip_1, Q5creativity_input_1, Q21intellective_judg_1, createdAt, updatedAt
lgl (1): platform
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
task_map <- read_csv('../task_map.csv')
Rows: 102 Columns: 24── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): task
dbl (23): Q1concept_behav, Q3type_1_planning, Q4type_2_generate, Q6type_5_cc, Q7type_7_battle, Q8type_8_performance, Q9divisible_unita...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df.main_questions_summary <- read_csv('../main_question_summary.csv')
Rows: 2100 Columns: 8── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): task, task_blob_url, question_name, all_values
dbl (4): mean_rating, n_labels, agreement, general.alpha
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
task_based_summary <- df.main_questions_summary %>%
filter(n_labels > 10) %>%
group_by(task, task_blob_url) %>%
summarize(
mean_agreement = mean(agreement),
mean_alpha = mean(general.alpha)
) %>%
arrange(desc(mean_agreement))
`summarise()` has grouped output by 'task'. You can override using the `.groups` argument.
task_based_summary
mean(task_based_summary$mean_agreement)
[1] 0.8184401
sd(task_based_summary$mean_agreement)
[1] 0.05319443
quantile(task_based_summary$mean_agreement, c(0.025, 0.9725))
2.5% 97.25%
0.7179399 0.8974814
question_based_summary <- df.main_questions_summary %>%
filter(n_labels > 10) %>%
group_by(question_name) %>%
summarize(
mean_agreement = mean(agreement),
mean_alpha = mean(general.alpha)
) %>%
arrange(desc(mean_agreement))
question_based_summary
mean(question_based_summary$mean_agreement)
[1] 0.8184401
sd(question_based_summary$mean_agreement)
[1] 0.05856959
quantile(question_based_summary$mean_agreement, c(0.025, 0.9725), na.rm = T)
2.5% 97.25%
0.7274445 0.9251960
corrplot(abs(cor(task_map[-1])), method = "shade",
addCoef.col = TRUE,
tl.col = "black", type = 'lower', diag = FALSE)
task_map[-1] %>% as.matrix() %>% mean()
[1] 0.4468006
task_map[-1] %>% as.matrix() %>% median()
[1] 0.375
task_map[-1] %>% as.matrix() %>% sd()
[1] 0.3480186
task_map[-1] %>% as.matrix() %>% range()
[1] 0 1
df.confidence_scores_raw <- df.mapping.raw %>%
select(c(task, grep('confidence', names(df.mapping.raw)))) %>%
pivot_longer(-task, names_to = "question") %>%
mutate(
value = recode(
value,
"Very confident" = 5,
"Confident" = 4,
"Neutral" = 3,
"Not confident" = 2,
"Not at all confident" =1
)) %>%
mutate(question = gsub("_confidence", "", question))
# This is z-scored by individual user
df.confidence_scores_zscore <- df.mapping.raw %>%
select(c(task, user, grep('confidence', names(df.mapping.raw)))) %>%
pivot_longer(-c(task, user), names_to = "question") %>%
mutate(
value = recode(
value,
"Very confident" = 5,
"Confident" = 4,
"Neutral" = 3,
"Not confident" = 2,
"Not at all confident" =1
)) %>%
group_by(user) %>%
mutate(
value = scale(value)
) %>% mutate(question = gsub("_confidence", "", question)) %>% ungroup()
There is a very strong correlation between the confidence scores and the level of agreement – about 0.77. This relationship holds regardless of whether you z-score the confidence scores (which helps to account for individual-level variation in assigning confidence).
# Task-based confidence
zscored_confidence_by_task <- df.confidence_scores_zscore %>%
group_by(task) %>%
summarize(
mean_confidence = mean(value, na.rm = T)
)
task_based_confidence <- inner_join(task_based_summary, zscored_confidence_by_task, by = "task")
cor.test(task_based_confidence$mean_agreement, task_based_confidence$mean_confidence)
Pearson's product-moment correlation
data: task_based_confidence$mean_agreement and task_based_confidence$mean_confidence
t = 9.3126, df = 100, p-value = 3.181e-15
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5614424 0.7734686
sample estimates:
cor
0.6815061
# Question-based confidence
zscored_confidence_by_question <- df.confidence_scores_zscore %>%
group_by(question) %>%
summarize(
mean_confidence = mean(value, na.rm = T)
)
question_based_confidence <- inner_join(question_based_summary, zscored_confidence_by_question, by = c("question_name"="question"))
cor.test(question_based_confidence$mean_agreement, question_based_confidence$mean_confidence)
Pearson's product-moment correlation
data: question_based_confidence$mean_agreement and question_based_confidence$mean_confidence
t = 4.99, df = 18, p-value = 9.487e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4817497 0.9007125
sample estimates:
cor
0.761855
A version of the above with the original ordinal variables (non-normalized)
# Task-based confidence
confidence_by_task <- df.confidence_scores_raw %>%
group_by(task) %>%
summarize(
mean_confidence = mean(value, na.rm = T)
)
task_based_confidence <- inner_join(task_based_summary, confidence_by_task, by = "task")
cor.test(task_based_confidence$mean_agreement, task_based_confidence$mean_confidence)
Pearson's product-moment correlation
data: task_based_confidence$mean_agreement and task_based_confidence$mean_confidence
t = 6.2799, df = 100, p-value = 8.83e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3762576 0.6582166
sample estimates:
cor
0.531818
# Question-based confidence
confidence_by_question <- df.confidence_scores_raw %>%
group_by(question) %>%
summarize(
mean_confidence = mean(value, na.rm = T)
)
question_based_confidence <- inner_join(question_based_summary, confidence_by_question, by = c("question_name"="question"))
cor.test(question_based_confidence$mean_agreement, question_based_confidence$mean_confidence)
Pearson's product-moment correlation
data: question_based_confidence$mean_agreement and question_based_confidence$mean_confidence
t = 5.0528, df = 18, p-value = 8.284e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4890439 0.9024980
sample estimates:
cor
0.7658298
ggplot(task_based_confidence,
aes(x = mean_agreement,
y = mean_confidence)) +
geom_point() +
labs(title ="Per Task: Level of Rater Agreement v. Mean Normalized Rater Confidence")
Hierarchical Clustering
set_labels_params <- function(nbLabels,
direction = c("tb", "bt", "lr", "rl"),
fan = FALSE) {
if (fan) {
angle <- 360 / nbLabels * 1:nbLabels + 90
idx <- angle >= 90 & angle <= 270
angle[idx] <- angle[idx] + 180
hjust <- rep(0, nbLabels)
hjust[idx] <- 1
} else {
angle <- rep(0, nbLabels)
hjust <- 0
if (direction %in% c("tb", "bt")) { angle <- angle + 45 }
if (direction %in% c("tb", "rl")) { hjust <- 1 }
}
list(angle = angle, hjust = hjust, vjust = 0.5)
}
dendro_data_k <- function(hc, k) {
hcdata <- ggdendro::dendro_data(hc, type = "rectangle")
seg <- hcdata$segments
labclust <- cutree(hc, k)[hc$order]
segclust <- rep(0L, nrow(seg))
heights <- sort(hc$height, decreasing = TRUE)
height <- mean(c(heights[k], heights[k - 1L]), na.rm = TRUE)
for (i in 1:k) {
xi <- hcdata$labels$x[labclust == i]
idx1 <- seg$x >= min(xi) & seg$x <= max(xi)
idx2 <- seg$xend >= min(xi) & seg$xend <= max(xi)
idx3 <- seg$yend < height
idx <- idx1 & idx2 & idx3
segclust[idx] <- i
}
idx <- which(segclust == 0L)
segclust[idx] <- segclust[idx + 1L]
hcdata$segments$clust <- segclust
hcdata$segments$line <- as.integer(segclust < 1L)
hcdata$labels$clust <- labclust
hcdata
}
plot_ggdendro <- function(hcdata,
direction = c("lr", "rl", "tb", "bt"),
fan = FALSE,
scale.color = NULL,
branch.size = 1,
label.size = 3,
nudge.label = 0.01,
expand.y = 0.1) {
direction <- match.arg(direction) # if fan = FALSE
ybreaks <- pretty(segment(hcdata)$y, n = 5)
ymax <- max(segment(hcdata)$y)
## branches
p <- ggplot() +
geom_segment(data = segment(hcdata),
aes(x = x,
y = y,
xend = xend,
yend = yend,
linetype = factor(line),
colour = factor(clust)),
lineend = "round",
show.legend = FALSE,
size = branch.size)
## orientation
if (fan) {
p <- p +
coord_polar(direction = -1) +
scale_x_continuous(breaks = NULL,
limits = c(0, nrow(label(hcdata)))) +
scale_y_reverse(breaks = ybreaks)
} else {
p <- p + scale_x_continuous(breaks = NULL)
if (direction %in% c("rl", "lr")) {
p <- p + coord_flip()
}
if (direction %in% c("bt", "lr")) {
p <- p + scale_y_reverse(breaks = ybreaks)
} else {
p <- p + scale_y_continuous(breaks = ybreaks)
nudge.label <- -(nudge.label)
}
}
# labels
labelParams <- set_labels_params(nrow(hcdata$labels), direction, fan)
hcdata$labels$angle <- labelParams$angle
p <- p +
geom_text(data = label(hcdata),
aes(x = x,
y = y,
label = label,
colour = factor(clust),
angle = angle),
vjust = labelParams$vjust,
hjust = labelParams$hjust,
nudge_y = ymax * nudge.label,
size = label.size,
show.legend = FALSE)
# colors and limits
if (!is.null(scale.color)) {
p <- p + scale_color_manual(values = scale.color)
}
ylim <- -round(ymax * expand.y, 1)
p <- p + expand_limits(y = ylim)
p
}
set.seed(1)
# Dissimilarity matrix
d <- dist(task_map %>% column_to_rownames("task"), method = "euclidean")
# Hierarchical clustering using Complete Linkage
# Ward's method
hc5 <- hclust(d, method = "ward.D2" )
# get optimal number of clusters
NbClust(data = task_map %>% column_to_rownames("task"), distance = "euclidean", min.nc = 2, max.nc = 15, method = "ward.D2")
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 9 proposed 2 as the best number of clusters
* 3 proposed 3 as the best number of clusters
* 2 proposed 4 as the best number of clusters
* 2 proposed 7 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 2 proposed 9 as the best number of clusters
* 4 proposed 15 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
$All.index
KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale
2 5.0282 54.1097 13.7944 12.5736 717.4633 25983.2072 48.4958 90.8314 1404.017 8.2830 0.3633 1.3478 0.3125 0.7722 17.4012 4.6682
3 0.9520 37.3073 14.2253 9.7025 866.4612 13566.6711 33.8802 79.8206 1444.717 9.4255 0.3356 1.6769 0.2283 0.7640 12.0491 4.8486
4 1.4898 32.8518 10.3127 9.6006 1013.2921 5717.1047 26.4159 69.7922 1527.323 10.7799 0.3521 1.5494 0.2167 0.7789 9.3683 4.4351
5 1.0936 29.5057 9.7490 9.8735 1149.2214 2356.3375 19.9609 63.1471 1646.044 11.9143 0.3665 1.6908 0.2022 0.6995 9.0202 6.5995
6 1.2302 27.6388 8.3527 10.0587 1261.0425 1133.6771 15.0749 57.3801 1711.120 13.1117 0.4247 1.5616 0.2126 0.6891 7.2175 6.8337
7 1.0029 26.1531 8.6021 9.9461 1453.1759 234.5955 12.5609 52.7872 1748.594 14.2526 0.4420 1.5222 0.2234 0.7584 11.4698 4.9897
8 1.5187 25.4054 6.1064 10.3907 1550.2107 118.3450 9.9807 48.4043 1771.909 15.5431 0.4189 1.5002 0.1887 0.7470 9.1452 5.2572
9 1.0486 24.1771 5.9626 10.3094 1638.2063 63.2103 8.5087 45.4517 1811.774 16.5528 0.4148 1.5246 0.1784 0.7636 4.6444 4.6722
10 1.1696 23.2782 5.2915 10.1884 1745.4337 27.2742 7.3348 42.7132 1873.966 17.6141 0.3887 1.5400 0.1856 0.6921 5.3386 6.6100
11 1.0495 22.4379 5.1614 10.0994 1825.5615 15.0442 6.2413 40.3901 1920.226 18.6272 0.3728 1.5016 0.1874 0.7489 5.0300 5.0601
12 1.0629 21.7822 4.9853 10.1053 1937.9781 5.9470 5.5491 38.2222 1967.643 19.6837 0.3546 1.4793 0.1856 0.2392 6.3608 34.1276
13 1.1606 21.2497 4.4459 10.1654 2047.9108 2.3755 4.9678 36.2161 2134.981 20.7740 0.3507 1.3955 0.1979 0.6875 4.9992 6.7056
14 1.0094 20.7017 4.4889 10.5390 2127.8937 1.2577 4.2905 34.4930 2318.485 21.8117 0.3372 1.3753 0.1967 0.8433 0.5573 2.2424
15 1.0104 20.2910 4.5415 10.9529 2288.9291 0.2977 3.9597 32.8190 2637.266 22.9243 0.3327 1.2518 0.2089 0.7060 2.9145 5.8640
Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
2 0.3145 45.4157 0.5859 1.7489 0.6301 0.2584 0.0166 3.5080 0.8981 0.7152
3 0.3129 26.6069 0.5097 0.0144 1.2816 0.2375 0.0165 3.9437 0.8352 0.6387
4 0.3006 17.4481 0.5564 0.4863 1.4480 0.2683 0.0225 4.1476 0.7853 0.5860
5 0.2941 12.6294 0.5459 0.1788 1.8756 0.2170 0.0234 4.1517 0.7503 0.5698
6 0.2805 9.5634 0.5539 0.0528 2.0283 0.2570 0.0236 3.8717 0.7160 0.5034
7 0.2745 7.5410 0.5660 1.0999 2.0988 0.2756 0.0241 3.9755 0.6919 0.4829
8 0.2661 6.0505 0.5078 5.4469 2.8884 0.2625 0.0257 3.9232 0.6597 0.4396
9 0.2574 5.0502 0.4345 0.1494 4.0769 0.2546 0.0266 4.4424 0.6359 0.4162
10 0.2484 4.2713 0.4326 0.1596 4.4875 0.2553 0.0274 4.4147 0.6179 0.4126
11 0.2419 3.6718 0.4298 0.2118 4.7576 0.2553 0.0275 4.3451 0.6016 0.3914
12 0.2342 3.1852 0.4216 0.0186 5.1975 0.2553 0.0285 4.2162 0.5853 0.3744
13 0.2270 2.7859 0.4230 0.2405 5.2121 0.2553 0.0286 3.9591 0.5678 0.3291
14 0.2205 2.4638 0.4140 0.0129 5.6305 0.2553 0.0289 3.9747 0.5540 0.3259
15 0.2150 2.1879 0.4156 0.0793 5.6431 0.2553 0.0292 3.7860 0.5401 0.2922
$All.CriticalValues
CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
2 0.8536 10.1150 0.0000
3 0.8276 8.1259 0.0000
4 0.8157 7.4583 0.0000
5 0.7791 5.9556 0.0000
6 0.7539 5.2241 0.0000
7 0.8220 7.7974 0.0000
8 0.8002 6.7412 0.0000
9 0.7475 5.0662 0.0000
10 0.7246 4.5606 0.0000
11 0.7475 5.0662 0.0000
12 0.5089 1.9301 0.0000
13 0.7153 4.3791 0.0000
14 0.5578 2.3781 0.0053
15 0.6634 3.5522 0.0000
$Best.nc
KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB Silhouette Duda
Number_clusters 2.0000 2.0000 4.0000 2.0000 7.0000 3.00 3.0000 4.0000 15.0000 8.0000 15.0000 15.0000 2.0000 9.0000
Value_Index 5.0282 54.1097 3.9126 12.5736 192.1334 4566.97 14.6156 3.3833 318.7813 -0.2808 0.3327 1.2518 0.3125 0.7636
PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
Number_clusters 9.0000 NA 2.0000 3.0000 2.0000 2.0000 2.0000 7.0000 0 2.000 0 15.0000
Value_Index 4.6444 NA 0.3145 18.8088 0.5859 1.7489 0.6301 0.2756 0 3.508 0 0.2922
$Best.partition
Categorization problem Mastermind
1 1
Logic Problem Sudoku
1 1
Rank cities by population, rank words by familiarity Shopping plan
1 1
Carter Racing (Experimenterless Version) Iterative lemonade stand task
1 1
Reading the mind in the eyes Writing story
1 2
Moral Reasoning (Disciplinary Action Case) Word construction from a subset of letters
2 2
Carter Racing Guessing the correlation
1 1
Wolf, goat and cabbage transfer Room assignment task
1 1
Arithmetic problem 1 Space Fortress
1 2
Visual Oddball Target The N light bulbs game
1 1
Word completion given starting letter Railroad Route Construction game
2 1
Allocating resources to programs Game of Clue - Terrorist Attack
2 1
Word completion given part of word NASA Moon survival
1 1
Image rating Estimating Factual Quantities
2 1
Run a mini business Recall videos
2 1
Search for Oil Task To evacuate or not to evacuate
1 2
Estimating geological metrics Euclidean traveling salesperson
1 1
Reproducing arts Estimating social quantity
1 1
Hidden figures in a picture (Searching Task) Estimating pages of a book
1 1
Abstract grid task Unscramble words (anagrams)
1 1
Random dot motion Target Search
1 1
Find the maximum Wildcam Gorongosa (Zooniverse)
1 1
Recall stories Recall association
1 1
Letters-to-numbers problems (cryptography) Architectural design task
1 2
Recall word lists Wason's Selection Task
1 1
Summarize Discussion Divergent Association Task
2 2
Crisis mapping 9 Dot Problem
1 1
The Fish game Advertisement writing
2 2
Hidden figures in a picture (Recall Task) Computer maze
1 1
Splitting a deck of cards Object based generalization for reasoning (Phyre)
1 1
Ravens Matrices Trivia Multiple Choice Quiz
1 1
Railroad Route Construction game (Impossible Version) Desert survival
2 1
Putting food into categories Wildcat Wells
2 2
Graph coloring task Husbands and wives transfer
1 1
Checkers Typing game
1 1
Recall images Whac-A-Mole
1 2
Oligopoly game Bullard Houses
2 1
Arithmetic problem 2 Find the common symbol
1 1
Blocks World for Teams Intergroup Prisoner's Dilemma
1 2
Minimum-effort tacit coordination game Public goods game
2 2
The beer game Pharmaceutical Company (hidden-profile)
2 1
Ultimatum game (various versions) New Recruit
2 2
Investment Game (aka Trust Game) Aerospace Investment (Role-playing)
2 2
Minimal Group Paradigm (study diversity) Volunteer Investment Game
2 2
Sender-Receiver game Iterated Snowdrift Game (With Punishment)
2 2
Dictator game and its variants Chicken
2 2
Battle of the sexes Apache helicopter flight simulator (Longbow2)
2 2
Prisoner's Dilemma (various versions) Mock jury
2 2
Biopharm Seltek Iterated Snowdrift Game (Without Punishment)
2 2
Investment game (hidden-profile) Organization Game
1 2
TOPSIM - general mgmt business game Best job candidate (hidden-profile)
2 1
# Plot the obtained dendrogram
colors = c( "#118AB2", "#A53860", "#073B4C", "#9071EE", "#209A92", "#3E885B", "#CC9328")
hcdata <- dendro_data_k(hc5, 2)
p <- plot_ggdendro(hcdata,
direction = "lr",
scale.color = colors,
label.size = 10,
branch.size = 2,
expand.y = 4) + theme_void()
p
df.mcg <- task_map %>%
select(
task,
Q1concept_behav,
Q20type_3_type_4,
Q3type_1_planning,
Q4type_2_generate,
Q6type_5_cc,
Q7type_7_battle,
Q8type_8_performance
)
ggplot(
df.mcg %>%
rename(
Physical = Q1concept_behav,
Intellective = Q20type_3_type_4,
Planning = Q3type_1_planning,
Generative = Q4type_2_generate,
`Cognitive Conflict` = Q6type_5_cc,
Battle = Q7type_7_battle,
Performance = Q8type_8_performance
) %>%
pivot_longer(cols = -task) %>%
rename(`Mean Rater Response` = value),
aes(x = name, y = task)
) + geom_tile(aes(fill = `Mean Rater Response`)) + scale_fill_gradient(low = "#CC3363",
high = "#07BEB8") + theme(axis.text.x = element_text(
angle = 90,
vjust = 0.5,
hjust = 1
)) +
labs(x = "Dimension in McGrath's Taxonomy",
y = "Task Name")
ggsave("26task-mcgrath-ratings.png")
Saving 14 x 18 in image
McGrath - within v. between-category variance
Physical = Q1concept_behav,
Intellective = Q20type_3_type_4,
Planning = Q3type_1_planning,
Generative = Q4type_2_generate,
`Cognitive Conflict` = Q6type_5_cc,
Battle = Q7type_7_battle,
Performance = Q8type_8_performance
task_map_discrete <- cbind(task_map$task, discretize(task_map[-1], nbins = 10)) %>%
rename(task = `task_map$task`)
df.mcg <- task_map_discrete %>%
select(
task,
Q1concept_behav,
Q20type_3_type_4,
Q3type_1_planning,
Q4type_2_generate,
Q6type_5_cc,
Q7type_7_battle,
Q8type_8_performance
)
df.laughlin <- task_map_discrete %>%
select(
task,
Q15dec_verifiability,
Q16shared_knowledge,
Q17within_sys_sol,
Q18ans_recog,
Q19time_solvability,
Q21intellective_judg_1,
Q24eureka_question
)
df.shaw <- task_map_discrete %>%
select(
task,
Q2intel_manip_1,
Q13outcome_multip,
Q14sol_scheme_mul
)
df.steiner <- task_map_discrete %>%
select(
task,
Q9divisible_unitary,
Q10maximizing,
Q11optimizing
)
df.zigurs <- task_map_discrete %>%
select(
task,
Q13outcome_multip,
Q14sol_scheme_mul,
Q22confl_tradeoffs,
Q23ss_out_uncert
)
for documentation, see: https://cran.r-project.org/web/packages/infotheo/infotheo.pdf
Confirming discretization still looks good (qualitatively)
pca <- task_map_discrete %>% #select(-continuous_questions) %>%
select(-task) %>%
prcomp(center = T)
kmeans_output <- pca$x %>% # 2 is the optimal number
kmeans(centers = 3, nstart = 100)
combined_data <- cbind(task_map,
pca$x, factor(kmeans_output$cluster)) %>%
rename(cluster = `factor(kmeans_output$cluster)`)
plot_ly(
x = combined_data$PC1,
y = combined_data$PC2,
z = combined_data$PC3,
type = "scatter3d",
mode = "markers", # can use mode = "text"
text = combined_data$task ,
color = combined_data$cluster
)
#total correlation (also known as multi-information)
multiinformation(task_map_discrete[-1])
[1] 45.7523
multiinformation(df.mcg[-1])
[1] 9.778726
multiinformation(df.laughlin[-1])
[1] 10.70914
multiinformation(df.shaw[-1])
[1] 2.313343
multiinformation(df.steiner[-1])
[1] 2.315579
multiinformation(df.zigurs[-1])
[1] 4.553186
# maybe don't run? takes forever, likely due to calculation of many conditional probabilities. also, negative and not interpretable
# interaction information
# interinformation(task_map_discrete[-1])
# interinformation(df.mcg[-1])
# interinformation(df.laughlin[-1])
# interinformation(df.shaw[-1])
# interinformation(df.steiner[-1])
# interinformation(df.zigurs[-1])
# entropy?
entropy(task_map_discrete[-1])
[1] 4.624973
entropy(df.mcg[-1])
[1] 4.624973
entropy(df.laughlin[-1])
[1] 4.611382
entropy(df.shaw[-1])
[1] 4.551887
entropy(df.steiner[-1])
[1] 4.584199
entropy(df.zigurs[-1])
[1] 4.611382
Notes on how this is supposed to work:
from https://arxiv.org/pdf/1701.08868.pdf > In the case of three random variables, interaction information is the gain (or loss) in information transmitted between any two of the variables, due to additional knowledge of the third random variable. That is, interaction information is the difference between the conditional and unconditional mutual information between two of the variables, where the conditioning is on the third variable. It is important to note that unlike (conditional) mutual information which is always non-negative, interaction information can be negative.